# Set the knitr chunk option `echo` to TRUE for all subsequent chunks.
knitr::opts_chunk$set(echo = TRUE)

I will show you how to use text data to build word clouds in R. I saved all the AWE Basecamp discussions to a Word file and then saved that Word file as plain text.

We will require three packages for this: tm, SnowballC, and wordcloud.

First, let’s load the required libraries and read in the data.

library(tm)
library(SnowballC)
library(wordcloud)

# Read the archive as raw lines of text, one element per line.
# read.csv() would consume the first line as a header and split every line on
# commas, which mangles free-form prose.
jeopQ <- readLines("../data/Words/BasecampArchive.txt")

Now, we will perform a series of operations on the text data to simplify it. First, we need to create a corpus.

# Build a tm corpus from the text held in jeopQ.
jeopCorpus <- Corpus(VectorSource(jeopQ))

Next, we will convert the corpus to a plain text document.

# Apply PlainTextDocument to the corpus via tm_map.
# NOTE(review): with recent tm releases this step may be unnecessary and could
# alter the corpus structure -- confirm against the installed tm version.
jeopCorpus <- tm_map(jeopCorpus, PlainTextDocument)

Then, we will remove all punctuation and stopwords. Stopwords are commonly used words in the English language, such as "I", "me", and "my". You can see the full list of stopwords using stopwords('english'). I also removed a few extra words by hand, including "the" and "this".

# Lower-case the text first: removeWords() matching is case-sensitive and the
# words listed below are all lower case, so a capitalized "The" or "This" at
# the start of a sentence would otherwise survive and show up in the cloud.
jeopCorpus <- tm_map(jeopCorpus, content_transformer(tolower))
jeopCorpus <- tm_map(jeopCorpus, removePunctuation)
# Drop the standard English stopwords plus a few words specific to this archive.
jeopCorpus <- tm_map(
  jeopCorpus,
  removeWords,
  c("october", "can", "will", "post", "just", "the", "this", "2015",
    stopwords("english"))
)

Next, we will perform stemming. This means that all the words are converted to their stem (Ex: learning -> learn, walked -> walk, etc.). This will ensure that different forms of the word are converted to the same form and plotted only once in the wordcloud.

# Stem every document so that different forms of a word are counted together.
jeopCorpus <- tm_map(jeopCorpus, stemDocument)

Now, we will plot the wordcloud.

# Plot at most 100 words, in non-random order, using a 5-step "Greens" palette.
wordcloud(
  jeopCorpus,
  max.words = 100,
  random.order = FALSE,
  colors = brewer.pal(5, "Greens")
)

There are a few ways to customize it.

There is an even easier way! See here.

The following packages are required for the rquery.wordcloud() function: tm, SnowballC, wordcloud, RColorBrewer, RCurl, and XML.

If needed, install these packages before using the rquery.wordcloud() function, as follows:

# One-time setup: install the packages loaded below.
install.packages(c("tm", "SnowballC", "wordcloud", "RColorBrewer", "RCurl", "XML"))
library(tm)
library(SnowballC)
library(wordcloud)
library(RCurl)
library(XML)

# Run the rquery_wordcloud.r script hosted on the STHDA site (presumably the
# rquery.wordcloud() definition used below -- confirm).
# NOTE(review): this executes remote code fetched over plain HTTP on every
# run -- consider keeping a reviewed local copy instead.
source('http://www.sthda.com/upload/rquery_wordcloud.r')

The format of the rquery.wordcloud() function is shown below:

# Usage summary only (not meant to be run as-is); defaults mirror the function
# definition, including min.freq.
rquery.wordcloud(x, type=c("text", "url", "file"),
        lang="english", excludeWords = NULL,
        textStemming = FALSE,  colorPalette="Dark2",
        min.freq=3, max.words=200)

Let's run it:

# Words to exclude from the cloud
badWords <- c(
  "also", "see", "may", "using", "make", "can", "will", "want", "know",
  "posted", "really", "october"
)
# Word cloud of the archive file: up to 100 words with frequency >= 5
res <- rquery.wordcloud(
  "../data/Words/BasecampArchive.txt",
  type = "file",
  lang = "english",
  min.freq = 5,
  max.words = 100,
  excludeWords = badWords
)

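Here are the parameters:

- x: a character string (plain text, a web URL, or the path to a text file)
- type: specifies whether x is plain text, a web page URL, or a file path
- lang: the language of the text
- excludeWords: a vector of words to exclude from the text
- textStemming: reduces words to their root form
- colorPalette: the name of a color palette from the RColorBrewer package, a color name, or a color code
- min.freq: words with a frequency below min.freq will not be plotted
- max.words: the maximum number of words to be plotted; the least frequent terms are dropped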

Note that the rquery.wordcloud() function returns a list containing two objects: tdm, the term-document matrix, which can be explored as illustrated in the next sections; and freqTable, the frequency table of words.

Change the arguments max.words and min.freq to plot more words:

# Same file, looser thresholds: up to 200 words with frequency >= 3
res <- rquery.wordcloud(
  "../data/Words/BasecampArchive.txt",
  type = "file",
  lang = "english",
  min.freq = 3,
  max.words = 200,
  excludeWords = badWords
)
Change the color of the word cloud

The color of the word cloud can be changed using the argument colorPalette.

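Allowed values for colorPalette: the name of an RColorBrewer palette (such as "Reds" or "RdBu"), a color name (such as "black"), or a color code.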

# "Reds" palette
res <- rquery.wordcloud(
  "../data/Words/BasecampArchive.txt",
  type = "file",
  lang = "english",
  colorPalette = "Reds",
  excludeWords = badWords
)

# "RdBu" palette
res <- rquery.wordcloud(
  "../data/Words/BasecampArchive.txt",
  type = "file",
  lang = "english",
  colorPalette = "RdBu",
  excludeWords = badWords
)

# A single color for every word
res <- rquery.wordcloud(
  "../data/Words/BasecampArchive.txt",
  type = "file",
  lang = "english",
  colorPalette = "black",
  excludeWords = badWords
)

Operations on the result of rquery.wordcloud() function

As mentioned above, the result of rquery.wordcloud() is a list containing two objects:

# Term-document matrix returned by rquery.wordcloud()
tdm <- res$tdm
# Word/frequency table returned by rquery.wordcloud()
freqTable <- res$freqTable
Frequency table of words

The frequencies of the most frequent words can be displayed and plotted as follows:

# Show the top 10 words and their frequency.
# head() returns at most 10 rows, so a table with fewer than 10 words is not
# padded with NA rows the way freqTable[1:10, ] would be.
top10 <- head(freqTable, 10)
top10
# Bar plot of the frequency for the top 10
barplot(top10$freq, las = 2, names.arg = top10$word,
        col = "lightblue", main = "Most frequent words",
        ylab = "Word frequencies")
Operations on term-document matrix

You can explore the frequent terms and their associations. In the following example, we want to identify words that occur at least 20 times:

# List the terms in tdm using a lower frequency bound of 20.
findFreqTerms(tdm, lowfreq = 20)

You could also analyze the correlation (or association) between frequent terms. The R code below identifies which words are associated with “campaign”:

# Find terms associated with "campaign", using a correlation limit of 0.3.
findAssocs(tdm, terms = "campaign", corlimit = 0.3)

Create a word cloud of a web page

In this section we’ll make a tag cloud of the following web page:

http://www.sthda.com/english/wiki/create-and-format-powerpoint-documents-from-r-software

# Address of the page to turn into a word cloud
url <- "http://www.sthda.com/english/wiki/create-and-format-powerpoint-documents-from-r-software"
rquery.wordcloud(x = url, type = "url")

The above word cloud shows that “powerpoint”, “doc”, “slide”, and “reporters” are among the most important words on the analyzed web page. This confirms that the article is about creating PowerPoint documents using the ReporteRs package in R.

R code of rquery.wordcloud function

#++++++++++++++++++++++++++++++++++
# rquery.wordcloud() : Word cloud generator
# - http://www.sthda.com
#+++++++++++++++++++++++++++++++++++
# x : character string (plain text, web url, txt file path)
# type : how to interpret x -- "text" (the default), "url" or "file";
  # only the first element is used, and an unrecognised value is an error
# lang : the language of the text, used to pick the stopword list
# excludeWords : a vector of words to exclude from the text; matched
  # case-insensitively because the text is lower-cased first
# textStemming : if TRUE, reduces words to their root form
# colorPalette : the name of color palette taken from RColorBrewer package,
  # or a color name, or a color code (a vector of colors is also accepted)
# min.freq : words with frequency below min.freq will not be plotted
# max.words : Maximum number of words to be plotted. least frequent terms dropped
#
# value returned by the function (invisibly) : a list(tdm, freqTable)
  # tdm : the term-document matrix
  # freqTable : data frame with columns word and freq, most frequent first
# side effects : attaches tm, SnowballC, wordcloud and RColorBrewer,
  # calls set.seed(1234) and draws the word cloud
rquery.wordcloud <- function(x, type=c("text", "url", "file"),
                          lang="english", excludeWords=NULL,
                          textStemming=FALSE,  colorPalette="Dark2",
                          min.freq=3, max.words=200)
{
  # Validate the input kind before doing any work. Without this, an unknown
  # type skipped every branch below and `text` silently resolved to the
  # graphics::text function, failing later with an unrelated error.
  type <- match.arg(type[1], c("text", "url", "file"))

  library("tm")
  library("SnowballC")
  library("wordcloud")
  library("RColorBrewer")

  raw_text <- switch(type,
    file = readLines(x),
    url = html_to_text(x),
    text = x
  )

  # Load the text as a corpus
  docs <- Corpus(VectorSource(raw_text))
  # Convert the text to lower case
  docs <- tm_map(docs, content_transformer(tolower))
  # Remove numbers
  docs <- tm_map(docs, removeNumbers)
  # Remove stopwords for the language
  docs <- tm_map(docs, removeWords, stopwords(lang))
  # Remove punctuations
  docs <- tm_map(docs, removePunctuation)
  # Eliminate extra white spaces
  docs <- tm_map(docs, stripWhitespace)
  # Remove your own stopwords; lower-cased to match the lower-cased text
  if (!is.null(excludeWords)) {
    docs <- tm_map(docs, removeWords, tolower(excludeWords))
  }
  # Text stemming
  if (textStemming) docs <- tm_map(docs, stemDocument)

  # Create term-document matrix and the sorted word/frequency table
  tdm <- TermDocumentMatrix(docs)
  m <- as.matrix(tdm)
  v <- sort(rowSums(m), decreasing = TRUE)
  d <- data.frame(word = names(v), freq = v)

  # A single RColorBrewer palette name is expanded to 8 colors; anything else
  # (a color name, a color code, or a vector of them) is used as given.
  is_brewer <- length(colorPalette) == 1L &&
    colorPalette %in% rownames(brewer.pal.info)
  cloud_colors <- if (is_brewer) brewer.pal(8, colorPalette) else colorPalette

  # Plot the word cloud; the fixed seed keeps the layout reproducible
  set.seed(1234)
  wordcloud(d$word, d$freq, min.freq = min.freq, max.words = max.words,
            random.order = FALSE, rot.per = 0.35,
            use.r.layout = FALSE, colors = cloud_colors)

  invisible(list(tdm = tdm, freqTable = d))
}

#++++++++++++++++++++++
# Helper function
#++++++++++++++++++++++
# Fetch a web page and return its text content as one character string
html_to_text <- function(url) {
  library(RCurl)
  library(XML)
  # Select every text node except those nested inside <script>, <style>,
  # <noscript> or <form> elements
  visible_text_xpath <- "//text()[not(ancestor::script)][not(ancestor::style)][not(ancestor::noscript)][not(ancestor::form)]"
  # Download the HTML, then parse it from the in-memory string
  page_source <- getURL(url)
  parsed_page <- htmlParse(page_source, asText = TRUE)
  text_nodes <- xpathSApply(parsed_page, visible_text_xpath, xmlValue)
  # Join the pieces with single spaces
  paste(text_nodes, collapse = " ")
}


ZellW/LearnPlotting documentation built on May 10, 2019, 1:57 a.m.